There are more and more bioconductor packages supporting single-cell data analysis. R Amezquita, A Lun, S Hicks, and R Gottardo wrote an integrated workflow, Orchestrating Single-Cell Analysis with Bioconductor, for single-cell data analysis and quality accessment. In this session, we will go through several important QC metrics which can’t be made with Seurat.
## class: SingleCellExperiment
## dim: 27998 100000
## metadata(1): Samples
## assays(1): counts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(100000): CGTGTCTTCGCATGAT-1 TTTATGCGTCGAACAG-1 ...
## TTATGCTGTGTTTGTG-1 GCTGCGACACGTTGGC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):
## DataFrame with 2 rows and 2 columns
## Sample Barcode
## <character> <character>
## CGTGTCTTCGCATGAT-1 ~/Desktop/01_scSeq_t.. CGTGTCTTCGCATGAT-1
## TTTATGCGTCGAACAG-1 ~/Desktop/01_scSeq_t.. TTTATGCGTCGAACAG-1
## DataFrame with 2 rows and 2 columns
## ID Symbol
## <character> <character>
## ENSMUSG00000051951 ENSMUSG00000051951 Xkr4
## ENSMUSG00000089699 ENSMUSG00000089699 Gm1992
## DataFrame with 2 rows and 3 columns
## rank total fitted
## <numeric> <numeric> <numeric>
## CGTGTCTTCGCATGAT-1 41200.5 1 NA
## TTTATGCGTCGAACAG-1 41200.5 1 NA
uniq <- !duplicated(bcrank$rank)
#
plot(bcrank$rank[uniq], bcrank$total[uniq], log = "xy", xlab = "Rank", ylab = "Total UMI count",
cex.lab = 1.2)
abline(h = metadata(bcrank)$inflection, col = "darkgreen", lty = 2)
abline(h = metadata(bcrank)$knee, col = "dodgerblue", lty = 2)
legend("bottomleft", legend = c("Inflection", "Knee"), col = c("darkgreen", "dodgerblue"),
lty = 2, cex = 1.2)## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted from
## logarithmic plot
set.seed(100)
limit <- 100
e.out <- emptyDrops(counts(sce), lower = limit, test.ambient = TRUE)
#
e.out## DataFrame with 100000 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## CGTGTCTTCGCATGAT-1 1 -5.34170 0.70912909 FALSE 0.95697174
## TTTATGCGTCGAACAG-1 1 -6.44503 0.50414959 FALSE 0.92603663
## CATGACACAAGTCTGT-1 0 NA NA NA NA
## ACATACGCACCACCAG-1 58 -266.20222 0.01839816 FALSE 0.45640979
## CGTCCATAGCTGCAAG-1 1601 -2917.18710 0.00009999 TRUE 0.00507393
## ... ... ... ... ... ...
## GTGCTTCGTCACCCAG-1 0 NA NA NA NA
## GGAAAGCCAAGGCTCC-1 1 -9.06371 0.20007999 FALSE 0.818381
## TGCCCTAGTAGCTCCG-1 1 -13.51757 0.00309969 FALSE 0.125927
## TTATGCTGTGTTTGTG-1 0 NA NA NA NA
## GCTGCGACACGTTGGC-1 0 NA NA NA NA
## Mode FALSE TRUE NA's
## logical 54010 591 45399
# Concordance by testing with FDR and limited
table(Sig = e.out$FDR <= 0.001, Limited = e.out$Limited)## Limited
## Sig FALSE TRUE
## FALSE 53525 485
## TRUE 1 590
library(scran)
library(scuttle)
library(scater)
clusters <- quickCluster(sce2)
sce2 <- computeSumFactors(sce2, cluster = clusters)
sce2 <- logNormCounts(sce2)
sce2## class: SingleCellExperiment
## dim: 27998 591
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(27998): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ID Symbol
## colnames(591): GCTCCTAAGGCACATG-1 GTAACTGTCGGATGGA-1 ...
## CACCAGGTCACCTTAT-1 AGGTCATCATGTAAGA-1
## colData names(3): Sample Barcode sizeFactor
## reducedDimNames(0):
## altExpNames(0):
g <- buildSNNGraph(sce2, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce2) <- factor(clust)
#
colData(sce2)## DataFrame with 591 rows and 4 columns
## Sample Barcode sizeFactor
## <character> <character> <numeric>
## GCTCCTAAGGCACATG-1 ~/Desktop/01_scSeq_t.. GCTCCTAAGGCACATG-1 0.913482
## GTAACTGTCGGATGGA-1 ~/Desktop/01_scSeq_t.. GTAACTGTCGGATGGA-1 0.970723
## CGGACACCATTACGAC-1 ~/Desktop/01_scSeq_t.. CGGACACCATTACGAC-1 0.725295
## GCGAGAAAGCTACCGC-1 ~/Desktop/01_scSeq_t.. GCGAGAAAGCTACCGC-1 0.700158
## AAACGGGCAGATGGGT-1 ~/Desktop/01_scSeq_t.. AAACGGGCAGATGGGT-1 1.636871
## ... ... ... ...
## GTGGGTCAGCACCGTC-1 ~/Desktop/01_scSeq_t.. GTGGGTCAGCACCGTC-1 1.026750
## CGCTATCAGTACGACG-1 ~/Desktop/01_scSeq_t.. CGCTATCAGTACGACG-1 0.677916
## GTACTTTTCCTTTCGG-1 ~/Desktop/01_scSeq_t.. GTACTTTTCCTTTCGG-1 0.978623
## CACCAGGTCACCTTAT-1 ~/Desktop/01_scSeq_t.. CACCAGGTCACCTTAT-1 1.130759
## AGGTCATCATGTAAGA-1 ~/Desktop/01_scSeq_t.. AGGTCATCATGTAAGA-1 1.689680
## label
## <factor>
## GCTCCTAAGGCACATG-1 2
## GTAACTGTCGGATGGA-1 2
## CGGACACCATTACGAC-1 2
## GCGAGAAAGCTACCGC-1 6
## AAACGGGCAGATGGGT-1 4
## ... ...
## GTGGGTCAGCACCGTC-1 6
## CGCTATCAGTACGACG-1 6
## GTACTTTTCCTTTCGG-1 1
## CACCAGGTCACCTTAT-1 7
## AGGTCATCATGTAAGA-1 4
# extrat potential ambient RNA and thee estimated score
amb <- metadata(e.out)$ambient[, 1]
head(amb)## ENSMUSG00000051951 ENSMUSG00000025902 ENSMUSG00000033845 ENSMUSG00000025903
## 5.808442e-07 5.808442e-07 1.067188e-04 6.783426e-05
## ENSMUSG00000033813 ENSMUSG00000002459
## 5.876210e-05 5.808442e-07
library(scater)
stripped <- sce2[names(amb), ]
out <- removeAmbience(counts(stripped), ambient = amb, groups = colLabels(stripped))| ## Integrate corrected counts |
r counts(stripped, withDimnames = FALSE) <- out stripped <- logNormCounts(stripped) |
| ## Before/After removement - Hemoglobin A1 (Hba-a1) as example - in most cases the Hbs are contaminated from residual RBCs |
r ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Hba-a1"] plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before") plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("After") |
| ## Before/After removement |
| ## Before/After removement - Krt17 as example |
r ensmbl_id <- rowData(sce2)$ID[rowData(sce2)$Symbol == "Krt17"] plotExpression(sce2, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("Before") plotExpression(stripped, x = "label", colour_by = "label", features = ensmbl_id) + ggtitle("After") |
| ## Before/After removement |
| ## Save result |
r saveRDS(stripped, "scSeq_CTRL_sceSub_rmAmbRNA.rds") |
| # Remove doublets |
dec <- modelGeneVar(stripped)
hvgs <- getTopHVGs(dec, n = 1000)
stripped <- runPCA(stripped, ncomponents = 10, subset_row = hvgs)
stripped <- runUMAP(stripped, dimred = "PCA")
g <- buildSNNGraph(stripped, k = 10, use.dimred = "PCA")
clust <- igraph::cluster_walktrap(g)$membership
colLabels(stripped) <- factor(clust)
plotUMAP(stripped, colour_by = "label")dbl.dens <- computeDoubletDensity(stripped, #subset.row=top.mam,
d=ncol(reducedDim(stripped)),subset.row=hvgs)
summary(dbl.dens)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2908 0.7978 1.1442 1.2442 1.6010 4.1961
plotColData(stripped, x = "label", y = "DoubletScore", colour_by = "label") + geom_hline(yintercept = quantile(colData(stripped)$DoubletScore,
0.95), lty = "dashed", color = "red") - No clusters have significantly higher doublet scores than other clusters.No clusters would be removed. - Red dash line representd 95% quantile of doublet score. The cells with higher doublrt score than this cut-off would be removed.
plotColData(sce_clean, x = "label", y = "subsets_Mito_percent", colour_by = "label") +
ggtitle("mitocondrial content")plotColData(sce_clean, x = "sum", y = "subsets_Mito_percent", colour_by = "label") +
ggtitle("is.mito vs read counts")plotColData(sce_clean, x = "sum", y = "detected", colour_by = "label") + ggtitle("gene counts vs read counts")vars <- getVarianceExplained(sce_clean, variables = c("DoubletScore", "label", "sum",
"detected", "subsets_Mito_percent"))
plotExplanatoryVariables(vars)## Warning: Removed 2765 rows containing non-finite values (stat_density).